In [1]:
import sys,os,gzip
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
%load_ext sql
In [2]:
# Put the parent of the current working directory on the import path
# (presumably so the local `splitter` module can be imported -- confirm).
sys.path.append(os.path.dirname(os.getcwd()))
In [3]:
import splitter
Our test set here includes the 16 million molecules from the old ZINC clean set that could be successfully processed by the RDKit.
We use the Standard InChI that comes with ChEMBL and a non-standard InChI (options "/FixedH /SUU") that allows tautomers to be distinguished. Here's the sequence of psql commands used to generate that set:
In [4]:
# Connect to the local inchi_split database and count the rows in
# zinc_clean_nonstandard.
%sql postgresql://localhost/inchi_split \
select count(*) from zinc_clean_nonstandard;
Out[4]:
Big caveat here: I forgot the last commit in my loading script, so the last block of structures is missing.
In [5]:
# Ten most frequent values of the formula column, with the number of ZINC ids
# sharing each one.
d = %sql \
select formula,count(zinc_id) freq from zinc_clean_nonstandard group by formula \
order by freq desc limit 10;
d
Out[5]:
In [6]:
# Ten most frequent (formula, skeleton, hydrogens) combinations. The result is
# only stored in d here; nothing is displayed by this cell.
d = %sql \
select formula,skeleton,hydrogens,count(zinc_id) freq from zinc_clean_nonstandard group by \
(formula,skeleton,hydrogens) \
order by freq desc limit 10;
Look at a few of the most common main-layer groups:
In [7]:
# Show the first five rows of the grouped result held in d.
d[:5]
Out[7]:
In [13]:
tpl=d[0][:-1]
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows][:9]
ms = [Chem.MolFromSmiles(y) for x,y in rows][:9]
Draw.MolsToGridImage(ms,legends=cids)
Out[13]:
In [14]:
tpl=d[1][:-1]
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows][:9]
ms = [Chem.MolFromSmiles(y) for x,y in rows][:9]
Draw.MolsToGridImage(ms,legends=cids)
Out[14]:
In [15]:
tpl=d[4][:-1]
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows][:9]
ms = [Chem.MolFromSmiles(y) for x,y in rows][:9]
Draw.MolsToGridImage(ms,legends=cids)
Out[15]:
In [16]:
# Same frequency ranking as before, but with the charge and protonation columns
# added to the grouping key; show the first five of the ten groups returned.
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,count(zinc_id) freq from zinc_clean_nonstandard group by \
(formula,skeleton,hydrogens,charge,protonation) \
order by freq desc limit 10;
d[:5]
Out[16]:
In [21]:
tpl=d[0][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens,coalesce(charge,''),coalesce(protonation,'')) = :tpl
cids = [x for x,y in rows][:9]
ms = [Chem.MolFromSmiles(y) for x,y in rows][:9]
Draw.MolsToGridImage(ms,legends=cids)
Out[21]:
In [26]:
# Frequency ranking with the stereo columns added to the grouping key,
# restricted to rows where stereo_bond or stereo_tet is non-null; show the
# first five of the ten groups returned.
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,stereo_bond,stereo_tet,stereo_m,stereo_s,count(zinc_id) freq \
from zinc_clean_nonstandard where stereo_bond is not null or stereo_tet is not null \
group by \
(formula,skeleton,hydrogens,charge,protonation,stereo_bond,stereo_tet,stereo_m,stereo_s) \
order by freq desc limit 10;
d[:5]
Out[26]:
In [27]:
tpl=d[0][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,''),coalesce(stereo_bond,''),\
coalesce(stereo_tet,''),coalesce(stereo_m,''),coalesce(stereo_s,'')) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[27]:
In [28]:
tpl=d[1][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select zinc_id,smiles from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,''),coalesce(stereo_bond,''),\
coalesce(stereo_tet,''),coalesce(stereo_m,''),coalesce(stereo_s,'')) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[28]:
In [30]:
# Fetch id, non-standard InChI and SMILES for every molecule whose formula,
# skeleton, hydrogens and charge columns equal the hard-coded values below.
td = %sql \
select t2.zinc_id,t2.nonstandard_inchi,t2.smiles from zinc_clean_nonstandard t1 join zinc_clean t2 using (zinc_id) \
where (formula,skeleton,hydrogens,charge)=\
('/C29H33N2','/c1-28(2)22-16-12-14-18-24(22)30(5)26(28)20-10-8-7-9-11-21-27-29(3,4)23-17-13-15-19-25(23)31(27)6',\
'/h7-21H,1-6H3','/q+1')
# Print the raw rows, then draw each molecule labelled with its ZINC id.
print(td)
cids = [x for x,y,z in td]
ms = [Chem.MolFromSmiles(z) for x,y,z in td]
Draw.MolsToGridImage(ms,legends=cids)
Out[30]:
If it's important to you that those molecules be treated as different and you're using InChI, you're out of luck. Note that at least ZINC12405219 and ZINC19940218 are, according to ZINC, separately available from vendors.
In [42]:
# Count the rows that have a non-null isotope column.
%sql \
select count(*) \
from zinc_clean_nonstandard where isotope is not null
Out[42]:
No need to look at the isotope layer here: this set has no labelled compounds. That's likely a property of how the ZINC clean set was constructed.
In [57]:
# Rows where fixedh_stereo_tet is set, contains no '?', and differs from
# stereo_tet; the cell's output is the number of such rows.
rows = %sql \
select zinc_id,smiles,nonstandard_inchi from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
fixedh_stereo_tet is not null and position('?' in fixedh_stereo_tet)<=0 and stereo_tet!=fixedh_stereo_tet
len(rows)
Out[57]:
In [58]:
# Draw the first 10 rows of the previous query result, labelled by ZINC id.
# Slice before parsing so SMILES are only converted for the molecules that are
# actually shown, not for the whole result set.
shown = rows[:10]
cids = [zinc_id for zinc_id,smi,inchi in shown]
ms = [Chem.MolFromSmiles(smi) for zinc_id,smi,inchi in shown]
Draw.MolsToGridImage(ms,legends=cids)
Out[58]:
Not much interesting there. There's no simple query to find questionable tautomer motion. :-)
In [68]:
# Rows where fixedh_stereo_bond is set, is not the bare '/b' value, contains no
# '?', and differs from stereo_bond; the cell's output is the number of such rows.
rows = %sql \
select zinc_id,smiles,nonstandard_inchi from zinc_clean join zinc_clean_nonstandard using (zinc_id) where \
fixedh_stereo_bond is not null and fixedh_stereo_bond!='/b' and position('?' in fixedh_stereo_bond)<=0 and stereo_bond!=fixedh_stereo_bond
len(rows)
Out[68]:
In [69]:
# Draw the first 10 rows of the previous query result, labelled by ZINC id.
# Slice before parsing so SMILES are only converted for the molecules that are
# actually shown, not for the whole result set.
shown = rows[:10]
cids = [zinc_id for zinc_id,smi,inchi in shown]
ms = [Chem.MolFromSmiles(smi) for zinc_id,smi,inchi in shown]
Draw.MolsToGridImage(ms,legends=cids)
Out[69]:
Not much interesting in those first results.
In [ ]: